####### SCRIPT FOR: 
####### DISTANCE AND TRUST: 
####### EXAMINATION OF THE TWO OPPOSING FACTORS IMPACTING ADOPTION OF POSTAL VOTING AMONG INDIVIDUALS
#######
####### CALCULATING DISTANCES BETWEEN RESPONDENTS' ADDRESS AND NEAREST POLLING STATION
#######
####### NOTE: This code cannot be fully replicated for two reasons:
####### 1. Due to privacy concerns, we cannot share individuals' residential addresses.
####### 2. The code requires a functioning Google Maps API key. Since the service is paid,
#######    we refrain from sharing the API key we used to calculate distances between
#######    individuals and polling stations. Nevertheless, interested readers are more
#######    than welcome to obtain their own key.



# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into the calculations below, then release the memory.
rm(list = ls())
gc()

# All relative paths below ("03-data/...") are resolved against the project
# folder. The author's folder is used when it exists; on any other machine the
# script keeps the current working directory instead of aborting in setwd(),
# so it has to be started from the project root there.
local({
  project.dir <- "C:/Users/miros/Desktop/Research/Adoption of Postal voting"
  if (dir.exists(project.dir)) {
    setwd(project.dir)
  } else {
    message("Project folder not found; keeping working directory: ", getwd())
  }
})




# ADD YOUR GOOGLE MAPS API KEY --------------------------------------------

library(gmapsdistance)

# The key is taken from the environment variable GOOGLE_MAPS_API_KEY when it is
# set (e.g. in ~/.Renviron), so the paid key never has to be written into this
# file. When the variable is not set, the placeholder below is passed on exactly
# as before and has to be replaced by hand.
set.api.key(Sys.getenv("GOOGLE_MAPS_API_KEY", unset = "YOUR_GOOGLE_MAPS_API_KEY"))




# DATA IMPORT: POLLING STATIONS -------------------------

# Load the spreadsheet with the addresses of the polling stations abroad
library(readxl)
Polling.Stations <- read_excel(path = "03-data/FIN_polling_stations_abroad.xlsx")

# Every space in the search string is swapped for "+", because that is the
# form that needs to be fed to gmapsdistance()
Polling.Stations[["Address_to_look_for"]] <- gsub(
  pattern = " ",
  replacement = "+",
  x = Polling.Stations[["Address_to_look_for"]],
  fixed = TRUE
)




# DATA IMPORT: FACE RESPONDENTS -------------------------------------------

# Address database of the FACE respondents
# (it cannot be shared because of privacy constraints)
FACE.resp.ALL <- read_excel(path = "03-data/Sensitive data - NOT SHARED")

# From the FACE survey data only two variables are needed: the ID code (Q1)
# and Q26, the stated distance from the polling station
library(haven)
FACE.survey.IDs <- with(
  read_sav(file = "03-data/FACE data FINAL.sav"),
  data.frame(Q1 = Q1, Q26 = Q26)
)

# Keep the IDs of those who answered Q26 (fewer requests to the Google Maps
# Server); Q26 itself is not needed afterwards, so only Q1 is retained
FACE.survey.IDs <- FACE.survey.IDs[!is.na(FACE.survey.IDs$Q26), "Q1", drop = FALSE]

# Flag every remaining ID as a participant and attach the flag to the address
# database. After the merge, "FACE.survey.IDs" holds only respondents who
# (1) have an address in our database, (2) participated in the FACE survey and
# (3) stated the distance from the nearest polling station
FACE.survey.IDs <- cbind(FACE.survey.IDs, Participation = 1)
FACE.survey.IDs <- merge(x = FACE.resp.ALL, y = FACE.survey.IDs, by.x = "id", by.y = "Q1")

rm(FACE.resp.ALL)

# Search string for Google Maps: "<address>, <ZIP and area>, <country>",
# then spaces become "+" (the form fed to the distance request) and "#" is
# dropped because a "#" in the address line crashes the Google Maps API access
FACE.survey.IDs$Address_to_look_for <- gsub(
  "#", "",
  gsub(
    " ", "+",
    paste(FACE.survey.IDs$Address, FACE.survey.IDs$"ZIP and area", FACE.survey.IDs$Country, sep = ", "),
    fixed = TRUE
  ),
  fixed = TRUE
)




# CREATING COUNTRY CLUSTERS -------------------------------------------------------
# For every country of residence (upper case, as spelled in the respondents'
# addresses) the list below names the countries whose polling stations are
# considered: the country of residence itself and/or neighbouring countries.
# (We try to minimize the number of requests to Google Maps Server, because it costs money)
#
# The list is expanded into a long table with one row per
# (Expat.Cntry, Polling.Cntry) pair, in the order written here.

cntry.pairing <- local({
  neighbours <- list(
    "AUSTRALIA"            = c("Australia", "New Zealand"),
    "AUSTRIA"              = c("Austria", "Czechia", "Germany", "Hungary", "Italy", "Slovenia", "Switzerland"),
    "BELGIUM"              = c("Belgium", "France", "Germany", "Luxembourg", "Netherlands"),
    "BRAZIL"               = c("Brazil", "Argentina", "Colombia", "Peru", "Uruguay"),
    "CANADA"               = c("Canada", "United States"),
    "CHILE"                = c("Chile", "Peru", "Argentina"),
    "CHINA"                = c("China", "Afghanistan", "India", "Kazakhstan", "Myanmar", "Nepal", "Russian Federation", "Viet Nam"),
    "COSTA RICA"           = c("Colombia", "Mexico"),
    "CROATIA"              = c("Croatia", "Austria", "Hungary", "Serbia", "Slovenia"),
    "CYPRUS"               = c("Cyprus", "Greece", "Turkey"),
    "CZECH REPUBLIC"       = c("Czechia", "Austria", "Germany", "Poland"),
    "DENMARK"              = c("Denmark", "Germany", "Sweden"),
    "ESTONIA"              = c("Estonia", "Latvia", "Russian Federation"),
    "FRANCE"               = c("France", "Belgium", "Germany", "Italy", "Luxembourg", "Spain", "Switzerland", "United Kingdom"),
    "GERMANY"              = c("Germany", "Austria", "Belgium", "Czechia", "Denmark", "France", "Luxembourg", "Netherlands", "Poland", "Switzerland"),
    "GREECE"               = c("Greece", "Bulgaria", "Cyprus", "Turkey"),
    "HONG KONG"            = c("China", "Taiwan", "Viet Nam"),
    "ICELAND"              = c("Iceland"),
    "INDONESIA"            = c("Indonesia", "Malaysia", "Singapore"),
    "IRELAND"              = c("Ireland", "United Kingdom"),
    "ISRAEL"               = c("Israel", "Egypt", "Jordan", "Lebanon"),
    "ITALY"                = c("Italy", "Austria", "France", "Slovenia", "Switzerland"),
    "JAPAN"                = c("Japan", "Korea, Republic of"),
    "LATVIA"               = c("Latvia", "Estonia", "Lithuania", "Russian Federation"),
    "LUXEMBOURG"           = c("Luxembourg", "Belgium", "France", "Germany"),
    "MALTA"                = c("Malta", "Italy"),
    "MEXICO"               = c("Mexico", "United States"),
    "MONACO"               = c("France", "Italy"),
    "NETHERLANDS"          = c("Netherlands", "Luxembourg", "Belgium", "Germany"),
    "NEW ZEALAND"          = c("New Zealand", "Australia"),
    "NORWAY"               = c("Norway", "Sweden", "Russian Federation"),
    "PARAGUAY"             = c("Brazil", "Argentina", "Uruguay"),
    "PHILIPPINES"          = c("Philippines"),
    "POLAND"               = c("Poland", "Czechia", "Germany", "Lithuania", "Ukraine", "Russian Federation"),
    "PORTUGAL"             = c("Portugal", "Spain"),
    "RUSSIA"               = c("Russian Federation", "Azerbaijan", "China", "Estonia", "Georgia", "Kazakhstan", "Latvia", "Norway", "Ukraine"),
    "SINGAPORE"            = c("Singapore", "Indonesia", "Malaysia"),
    "SLOVENIA"             = c("Slovenia", "Austria", "Croatia", "Italy", "Hungary"),
    "SOUTH AFRICA"         = c("South Africa", "Mozambique", "Namibia", "Swaziland"),
    "SPAIN"                = c("Spain", "Portugal", "France", "Morocco"),
    "SWEDEN"               = c("Sweden", "Denmark", "Norway", "Russian Federation"),
    "SWITZERLAND"          = c("Switzerland", "Austria", "Italy", "Germany", "France"),
    "TAIWAN"               = c("Taiwan", "Myanmar", "Malaysia"),
    "THAILAND"             = c("Thailand", "China"),
    "TURKEY"               = c("Turkey", "Bulgaria", "Cyprus", "Greece", "Georgia", "Iran"),
    "UKRAINE"              = c("Ukraine", "Hungary", "Poland", "Romania", "Russian Federation"),
    "UNITED ARAB EMIRATES" = c("United Arab Emirates", "Qatar", "Saudi Arabia"),
    "UNITED KINGDOM"       = c("United Kingdom", "Ireland", "Netherlands", "Belgium", "France"),
    "USA"                  = c("United States", "Canada", "Mexico")
  )

  # Repeat each country of residence once per polling country assigned to it
  data.frame(
    Expat.Cntry   = rep(names(neighbours), times = lengths(neighbours)),
    Polling.Cntry = unlist(neighbours, use.names = FALSE)
  )
})


#View(cntry.pairing)




# CALCULATING NUMBER OF REQUESTS NEEDED FROM GOOGLE MAPS ------------------
# For every country of residence listed in "cntry.pairing" the table holds
#   Respondents      - number of respondents living in that country
#   Polling.Stations - number of polling stations located in the countries
#                      paired with it
# The country lists are read from "cntry.pairing" (the same table that later
# generates the actual respondent x polling station pairs) instead of being
# typed out a second time, so the estimate cannot drift away from the requests
# that are really sent. %in% is used for the comparisons, so a missing country
# value counts as "no match" instead of turning the whole count into NA.
distances.summary <- local({
  expat.countries <- unique(as.character(cntry.pairing$Expat.Cntry))

  data.frame(
    Country = expat.countries,
    Respondents = vapply(
      expat.countries,
      function(cntry) sum(FACE.survey.IDs$Country %in% cntry),
      integer(1),
      USE.NAMES = FALSE
    ),
    Polling.Stations = vapply(
      expat.countries,
      function(cntry) {
        # Countries whose polling stations are considered for this country of residence
        nearby <- as.character(cntry.pairing$Polling.Cntry[cntry.pairing$Expat.Cntry == cntry])
        sum(Polling.Stations$Country %in% nearby)
      },
      integer(1),
      USE.NAMES = FALSE
    )
  )
})


# Adding multiplication: "number of respondents" x "number of polling stations"
distances.summary$Number.of.requests <- distances.summary$Respondents*distances.summary$Polling.Stations

# Adding total to the table: one blank spacer row followed by the grand total.
# (The "" cells turn every count column into text; the table is for display only.)
distances.summary <-
rbind(
  distances.summary,
  data.frame(Country = c(""), Respondents = c(""), Polling.Stations = c(""), Number.of.requests = c("")),
  data.frame(Country = "TOTAL", Respondents = c(""), Polling.Stations = c(""), Number.of.requests = sum(distances.summary$Number.of.requests))
)

# Output
names(distances.summary) <- c("Country", "Number.of.Respondents", "Number.of.Polling.Stations", "Number.of.requests")
distances.summary
#write.csv(distances.summary, file = "03-data/distances.summary .csv")

rm(distances.summary)



# CALCULATING DISTANCES ---------------------------------------------------

# Distances are requested only between a respondent and "reasonably" distant
# polling stations, i.e. those in the countries paired with the respondent's
# country of residence in "cntry.pairing" (the country itself and/or its
# neighbours). This keeps the number of paid requests to Google Maps Server low.

# Step 1: every respondent is repeated once per polling country paired with
#         his/her country of residence
# Step 2: every such row is repeated once per polling station in that country
Distances.Pairwise <- merge(
  x = merge(x = FACE.survey.IDs, y = cntry.pairing, by.x = "Country", by.y = "Expat.Cntry"),
  y = Polling.Stations,
  by.x = "Polling.Cntry", by.y = "Country"
)

# Descriptive column names, assigned in the order in which merge() returns the
# columns (key first, then respondent columns, then polling station columns)
Distances.Pairwise <- setNames(
  Distances.Pairwise,
  c("Polling.Cntry", "Respondent.Country", "Respondent.id", "Respondent.Address", "Respondent.ZIP.area", "Respondent.Participation", "Respondent.Address_to_look_for", "Polling.City", "Polling.Place", "Polling.Address", "Polling.Address_to_look_for")
)

# Respondent columns first, polling station columns second;
# "Respondent.Participation" is left out
Distances.Pairwise <- Distances.Pairwise[c(
  "Respondent.id", "Respondent.Address", "Respondent.ZIP.area", "Respondent.Country",
  "Respondent.Address_to_look_for",
  "Polling.Cntry", "Polling.City", "Polling.Place", "Polling.Address", "Polling.Address_to_look_for"
)]


# Now, the dataset "Distances.Pairwise" includes all pairwise combinations of respondents' addresses 
# and reasonably distant polling stations


# Calculation of distances
# Result table seeded with one blank row; the loop below overwrites that row
# and appends one row per respondent x polling station pair
Distances <- data.frame(Respondent.ID = "", Origin = "", Destination = "",
                        Time = "", Distance = "", Status = "")

library(gmapsdistance)

# One Google Maps request per row of "Distances.Pairwise". The range starts at
# nrow(Distances), which is 1 for the freshly created table above.
# NOTE(review): re-running only this loop on a partly filled table would
# restart at its last filled row - presumably meant as a way to resume after
# an interrupted run; confirm.
for (pair.row in nrow(Distances):nrow(Distances.Pairwise)) {
  from.address <- Distances.Pairwise$Respondent.Address_to_look_for[pair.row]
  to.address <- Distances.Pairwise$Polling.Address_to_look_for[pair.row]

  # Driving route for this single origin/destination pair
  route <- gmapsdistance(origin = from.address,
                         destination = to.address,
                         mode = "driving", combinations = "pairwise", shape = "wide")

  # Respondent id, both search strings and the values returned for the route
  Distances[pair.row, ] <- data.frame(Respondent.ID = Distances.Pairwise$Respondent.id[pair.row],
                                      Origin = from.address,
                                      Destination = to.address,
                                      data.frame(route))
}

#View(Distances)

# Saving an emergency copy of the data
#write.csv(Distances, file = "03-data/Sensitive data - DO NOT SHARE/Distances-respondents-vs-polling_stations.csv")


# Removing the loop temporaries
rm(pair.row, from.address, to.address, route)


# Adding distances to the general dataset with addresses
# Both tables receive the same key: respondent id, origin search string and
# destination search string, separated by single spaces
Distances$Pairing <- with(Distances, paste(Respondent.ID, Origin, Destination))
Distances.Pairwise$Pairing <- with(
  Distances.Pairwise,
  paste(Respondent.id, Respondent.Address_to_look_for, Polling.Address_to_look_for)
)

# Joining the Google Maps results to the address pairs through that key
Distances.Pairwise <- merge(x = Distances.Pairwise, y = Distances, by = "Pairing")

# Only the id, the respondent's address parts, the polling station's address
# parts and the three result columns are kept (the key, the search strings and
# the duplicate id column are dropped)
Distances.Pairwise <- Distances.Pairwise[c(
  "Respondent.ID", "Respondent.Address", "Respondent.ZIP.area", "Respondent.Country",
  "Polling.Address", "Polling.Place", "Polling.City", "Polling.Cntry",
  "Time", "Distance", "Status"
)]

# Inputs that are not needed anymore
rm(Distances, cntry.pairing, FACE.survey.IDs, Polling.Stations)




# CORRECTION OF PROBLEMATIC ADDRESSES OF RESPONDENTS -------------------------------------
# There are 10 respondents with problematic addresses for which only NAs were returned.
# These are manually corrected and distances are calculated again.

# Respondents with incorrect addresses are extracted
Missing.routes <- subset(Distances.Pairwise,
                         Respondent.ID %in% c("201641", "202098", "202776", "204310", "204321",
                                              "205068", "206380", "206455", "209103", "209373"))

# Correction of addresses
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "201641"]  <- "18 Ross Ave"    
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "202098"]  <- "Atrium Palace"
Missing.routes$Respondent.ZIP.area[Missing.routes$Respondent.ID == "202098"] <- "3032 Limassol"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "202776"]  <- "Metsa, Tutermaa kula"
Missing.routes$Respondent.ZIP.area[Missing.routes$Respondent.ID == "202776"] <- "Harku vald Harjumaa 76617"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "204310"]  <- "Anilevich"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "204321"]  <- "Berdyczewski Street 16/1"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "205068"]  <- "Via Moneta, 54"
Missing.routes$Respondent.ZIP.area[Missing.routes$Respondent.ID == "205068"] <- "22070 Carbonate CO"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "206380"]  <- ""
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "206455"]  <- "Avda. Juan Carlos No 37"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "209103"]  <- "Dumbarton Court Brixton Hill"
Missing.routes$Respondent.Address[Missing.routes$Respondent.ID == "209373"]  <- "Plimsoll House, Ashgrove Road, BS6 6LZ Bristol"

# Generating keys for search on Google Maps
Missing.routes$Respondent.Address_to_look_for <- paste(Missing.routes$Respondent.Address, Missing.routes$Respondent.ZIP.area, Missing.routes$Respondent.Country, sep = ", ")
# The polling station's country is stored in "Polling.Cntry". The name
# "Polling.Country" used here before does not exist in this table (it evaluates
# to NULL), so the country was silently missing from the search key.
Missing.routes$Polling.Address_to_look_for <- paste(Missing.routes$Polling.Address, Missing.routes$Polling.City, Missing.routes$Polling.Cntry, sep = ", ")

# Replacing " " with "+" because that is what has to be fed into Google Maps
Missing.routes$Respondent.Address_to_look_for <- gsub(" ", "+", Missing.routes$Respondent.Address_to_look_for, fixed = TRUE)
Missing.routes$Polling.Address_to_look_for <- gsub(" ", "+", Missing.routes$Polling.Address_to_look_for, fixed = TRUE)
# Same clean-up as for the original respondent keys: "#" in the address line
# crashes the Google Maps API access
Missing.routes$Respondent.Address_to_look_for <- gsub("#", "", Missing.routes$Respondent.Address_to_look_for, fixed = TRUE)


# Getting the distances from Google Maps
# Result table seeded with one blank row; the loop below overwrites that row
# and appends one row per corrected respondent x polling station pair
Missing.routes.Distances <- data.frame(Respondent.ID = "", Origin = "", Destination = "",
                                       Time = "", Distance = "", Status = "")

library(gmapsdistance)

# One Google Maps request per row of "Missing.routes". The range starts at
# nrow(Missing.routes.Distances), which is 1 for the freshly created table above.
for (pair.row in nrow(Missing.routes.Distances):nrow(Missing.routes)) {
  from.address <- Missing.routes$Respondent.Address_to_look_for[pair.row]
  to.address <- Missing.routes$Polling.Address_to_look_for[pair.row]

  # Driving route for this single origin/destination pair
  route <- gmapsdistance(origin = from.address,
                         destination = to.address,
                         mode = "driving", combinations = "pairwise", shape = "wide")

  # Respondent id, both search strings and the values returned for the route
  Missing.routes.Distances[pair.row, ] <- data.frame(Respondent.ID = Missing.routes$Respondent.ID[pair.row],
                                                     Origin = from.address,
                                                     Destination = to.address,
                                                     data.frame(route))
}
#View(Missing.routes.Distances)

# Removing the loop temporaries
rm(pair.row, from.address, to.address, route)




# Attaching the retrieved distances to the dataset with addresses.
# Both tables first get the same route identifier: respondent ID plus the two
# search keys, separated by single spaces (the default separator of paste()).
Missing.routes.Distances$Pairing <- with(Missing.routes.Distances,
                                         paste(Respondent.ID, Origin, Destination))
Missing.routes$Pairing <- with(Missing.routes,
                               paste(Respondent.ID, Respondent.Address_to_look_for, Polling.Address_to_look_for))

# Joining the two tables on the route identifier
Missing.routes.Distances <- merge(x = Missing.routes.Distances, y = Missing.routes, by = "Pairing")

# Keeping only the relevant columns (selected by position in the merged table)
# and giving them the names used for the other distance data
Missing.routes.Distances <- setNames(
  Missing.routes.Distances[, c(2, 9:11, 15:12, 5:7)],
  c("Respondent.ID", "Respondent.Address", "Respondent.ZIP.area", "Respondent.Country",
    "Polling.Address", "Polling.Place", "Polling.City", "Polling.Cntry",
    "Time", "Distance", "Status")
)

# "Missing.routes" is not needed from here on
rm(Missing.routes)


# Dropping the respondents with problematic addresses from the "Distances.Pairwise" data.
# Rows with a missing respondent ID are dropped as well.
Problematic.IDs <- c("201641", "202098", "202776", "204310", "204321",
                     "205068", "206380", "206455", "209103", "209373")
Distances.Pairwise <- subset(Distances.Pairwise,
                             !is.na(Respondent.ID) & !(Respondent.ID %in% Problematic.IDs))

# Appending the corrected distance information for these respondents
Distances.Pairwise <- rbind(Distances.Pairwise, Missing.routes.Distances)
rm(Missing.routes.Distances, Problematic.IDs)

# Saving the backup file with all "reasonable" distances
#write.csv(Distances.Pairwise, file = "03-data/Sensitive data - DO NOT SHARE/Distances-respondents-vs-polling_stations.csv")




# CHECK: ARE THERE ANY POLLING STATIONS WITH WRONG ADDRESS? --------
library(dplyr)

# Counting, for every polling station, the routes with and without a returned distance
Polling.Stations.Check <- 
  Distances.Pairwise %>% 
  group_by(Polling.Address) %>% 
  count(is.na(Distance))

names(Polling.Stations.Check) <- c("Polling.Address", "Distance.NA", "n")

# Checking which polling stations returned only NAs:
# the NA counts (".x" columns) and the non-NA counts (".y" columns) of each station are put side by side
Polling.Stations.Check <- merge(subset(Polling.Stations.Check, Distance.NA == TRUE), subset(Polling.Stations.Check, Distance.NA == FALSE), by = "Polling.Address", all = TRUE)

# Subsetting those polling stations which returned only NAs
# (they have no non-NA counterpart, so their ".y" columns are missing)
Polling.Stations.Only.NA <- subset(Polling.Stations.Check, is.na(Distance.NA.y))$Polling.Address
Polling.Stations.Only.NA

# Selecting Respondents paired with these Polling stations.
# The addresses are taken directly from the check above rather than typed in by hand, so the
# selection stays in sync with the data and does not depend on how special characters
# in the addresses are encoded in this script.
Respondents.vs.Polling.Check <- subset(Distances.Pairwise, Polling.Address %in% Polling.Stations.Only.NA)

#View(Respondents.vs.Polling.Check)
# As can be seen, all polling stations returning only NAs are (a) not frequently used, and (b) in neighbouring countries or overseas territories;
# hence, there is no reason to worry that Google Maps could not locate them correctly. Rather, it is the lack of a reasonable connection
# that makes them return NAs.
rm(Polling.Stations.Check, Polling.Stations.Only.NA, Respondents.vs.Polling.Check)




# SELECTING THE SHORTEST FROM ALL REASONABLE DISTANCES --------------------

# Switching the distances and time to numeric (otherwise minimum cannot be identified)
Distances.Pairwise$Distance <- as.numeric(Distances.Pairwise$Distance)
Distances.Pairwise$Time <- as.numeric(Distances.Pairwise$Time)


# Selecting the smallest distance from the polling station (one row per respondent)
# NOTE(review): na.omit() drops a row when ANY of its columns is NA, not only Distance -
# confirm that this is intended.
library(dplyr)
Distances.data <- 
  na.omit(Distances.Pairwise) %>% 
  group_by(Respondent.ID) %>% 
  slice(which.min(Distance)) %>% 
  ungroup()

#View(Distances.data)
# Saving a backup copy on local disc
#write.csv(Distances.data, file = "03-data/Sensitive data - DO NOT SHARE/Distances - full address.csv")

# Anonymizing the data
# The columns to share are selected by name, so the respondents' street address and ZIP area
# cannot end up in the shared file if the column order ever changes.
Anonymized.Columns <- c("Respondent.ID", "Respondent.Country",
                        "Polling.Address", "Polling.Place", "Polling.City", "Polling.Cntry",
                        "Time", "Distance")
Distances.data.anon <- Distances.data[Anonymized.Columns]
write.csv(Distances.data.anon, file = "03-data/Distances - anonymized.csv")
# So, the file "03-data/Distances - anonymized.csv" includes the shortest distances between FACE respondents and reasonably located
# polling stations and will be imported into the main part of the code

rm(Distances.data, Distances.Pairwise, Anonymized.Columns)